Load libraries

library(R.utils)
library(dplyr)
library(stringr)
library(Seurat)
library(purrr)
library(scCATCH)
library(ggplot2)
library(SingleR)
library(msigdbr)
library(tibble)
library(clusterProfiler)
library(org.Hs.eg.db)
library(org.Mm.eg.db)
library(ggpubr)
library(openxlsx)
theme_set(theme_bw())

Obtain files

Download data

This project uses the data from Grosselin et al, 2019, which is uploaded to GEO.

mkdir -p data
cd data
wget -r -np -nd 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE117nnn/GSE117309/suppl/GSE117309_RAW.tar' -R "index.html*" # Omitting the index file

Decompress files

# Untar the main file
untar("data/GSE117309_RAW.tar", exdir="data/")

# Untar sc-RNAseq files
RNA_files <- list.files(path="data/",pattern="scRNA")
untar_RNA_files <- function(file){untar(paste0("data/",file), exdir="data/RNA_files")}
sapply(RNA_files, untar_RNA_files)

Read files into R

# Create function to load the files
load_scRNA_files <- function(model, organism){
  directory <- paste0("data/RNA_files/filtered_gene_bc_matrices_",model,"/",organism,"/")
  expression_matrix <- ReadMtx(
  mtx=paste0(directory, "matrix.mtx"),
  features=paste0(directory, "genes.tsv"),
  cells=paste0(directory, "barcodes.tsv"))
    # Correct gene names for easier automatization
  expression_matrix@Dimnames[[1]] <- str_remove_all(expression_matrix@Dimnames[[1]],
                                             pattern=paste0(organism,"_"))
  seurat_object <- CreateSeuratObject(counts = expression_matrix, project=model)

  return(seurat_object)
}

models <- c("HBCx-95","HBCx-95_CAPAR", "HBCx-22","HBCx22-TAMR")

scRNA_files_hg19 <- mapply(FUN=load_scRNA_files, models, organism="hg19")
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
names(scRNA_files_hg19) <- paste0(names(scRNA_files_hg19), "_hg19")

scRNA_files_mm10 <- mapply(FUN=load_scRNA_files, models, organism="mm10")
names(scRNA_files_mm10) <- paste0(names(scRNA_files_mm10), "_mm10")

Analysis of human cells

Data integration

# Normalize and identify variable features
scRNA_files_hg19 <- lapply(X = scRNA_files_hg19, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# Select features repeatedly variable between the 4 datasets
features <- SelectIntegrationFeatures(object.list = scRNA_files_hg19)

# Select anchors for integration
brca_anchors <- FindIntegrationAnchors(object.list = scRNA_files_hg19, anchor.features = features)
## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2276 anchors
## Filtering anchors
##  Retained 1594 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2558 anchors
## Filtering anchors
##  Retained 1046 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 3666 anchors
## Filtering anchors
##  Retained 1570 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2201 anchors
## Filtering anchors
##  Retained 1215 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2766 anchors
## Filtering anchors
##  Retained 1635 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 3104 anchors
## Filtering anchors
##  Retained 2412 anchors
# Create the integrated data assay
scRNA_combined <- IntegrateData(anchorset=brca_anchors)
## Merging dataset 4 into 3
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 2 1 into 3 4
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data

Quality control

# Use the integrated assay as default
DefaultAssay(scRNA_combined) <- "RNA"

# Creating a new slot with the percentage of counts on mitochondrial reads


scRNA_combined[["percent.mt"]] <- PercentageFeatureSet(scRNA_combined, pattern="MT-")

# Violin plots of specific features
VlnPlot(scRNA_combined, 
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 3,
        split.by="orig.ident")
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##       
## This message will be shown once per session.

plot_qc_hg38_1 <- VlnPlot(scRNA_combined, 
        features = c("nFeature_RNA"),
        split.by="orig.ident") +
  geom_hline(yintercept=c(200,6000), linetype="dashed", color = "firebrick4") + ylim(0,7000)
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
plot_qc_hg38_2 <-  VlnPlot(scRNA_combined, 
        features = c("percent.mt"),
        split.by="orig.ident") +
  geom_hline(yintercept=10, linetype="dashed", color = "firebrick4") + ylim(0,50)
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

We will remove cells with a percentage of mitochondrial reads too high (>5%) and with an abnormally high or low number of feature counts, that can suggest empty droplets or multiplets in the microfluidic system, respectively.

scRNA_combined <- subset(scRNA_combined, subset = nFeature_RNA > 200 & nFeature_RNA < 6000 & percent.mt < 10)

VlnPlot(scRNA_combined, 
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 3,
        split.by="orig.ident")

Visualization and clustering

Check integration

First we will explore the UMAPs without integration

# First let's do the same analysis without integration
scRNA_combined <- FindVariableFeatures(scRNA_combined)
scRNA_combined <- ScaleData(scRNA_combined, verbose = FALSE)
scRNA_combined <- RunPCA(scRNA_combined, npcs = 30, verbose = FALSE)
scRNA_combined <- RunUMAP(scRNA_combined, reduction = "pca", dims = 1:30)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:14:52 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:14:52 Read 3006 rows and found 30 numeric columns
## 17:14:52 Using Annoy for neighbor search, n_neighbors = 30
## 17:14:52 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:14:53 Writing NN index file to temp file C:\Users\alba_\AppData\Local\Temp\Rtmps7vENO\file42f0639647fe
## 17:14:53 Searching Annoy index using 1 thread, search_k = 3000
## 17:14:54 Annoy recall = 100%
## 17:14:54 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:14:55 Found 2 connected components, falling back to 'spca' initialization with init_sdev = 1
## 17:14:55 Using 'irlba' for PCA
## 17:14:55 PCA: 2 components explained 46.18% variance
## 17:14:55 Scaling init to sdev = 1
## 17:14:55 Commencing optimization for 500 epochs, with 120564 positive edges
## 17:15:05 Optimization finished
scRNA_combined <- FindNeighbors(scRNA_combined, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
scRNA_combined <- FindClusters(scRNA_combined, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3006
## Number of edges: 115260
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8814
## Number of communities: 7
## Elapsed time: 0 seconds
p1_hg38 <- DimPlot(scRNA_combined, reduction = "umap", group.by = "orig.ident") + ggtitle("hg19 pre-integration")
p2 <- DimPlot(scRNA_combined, reduction = "umap", label = TRUE, repel = TRUE)
p1_hg38 + p2

# Change the default to the integrated dataset
DefaultAssay(scRNA_combined) <- "integrated"

scRNA_combined <- ScaleData(scRNA_combined, verbose = FALSE)
scRNA_combined <- RunPCA(scRNA_combined, npcs = 30, verbose = FALSE)
scRNA_combined <- RunUMAP(scRNA_combined, reduction = "pca", dims = 1:30)
## 17:15:10 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:15:10 Read 3006 rows and found 30 numeric columns
## 17:15:10 Using Annoy for neighbor search, n_neighbors = 30
## 17:15:10 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:15:10 Writing NN index file to temp file C:\Users\alba_\AppData\Local\Temp\Rtmps7vENO\file42f04ae15cc0
## 17:15:10 Searching Annoy index using 1 thread, search_k = 3000
## 17:15:11 Annoy recall = 100%
## 17:15:11 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:15:13 Initializing from normalized Laplacian + noise (using irlba)
## 17:15:13 Commencing optimization for 500 epochs, with 129870 positive edges
## 17:15:23 Optimization finished
scRNA_combined <- FindNeighbors(scRNA_combined, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
scRNA_combined <- FindClusters(scRNA_combined, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3006
## Number of edges: 151631
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7946
## Number of communities: 5
## Elapsed time: 0 seconds
# Visualization
p2_hg38 <- DimPlot(scRNA_combined, reduction = "umap", group.by = "orig.ident") + ggtitle("hg19 post-integration")
p3 <- DimPlot(scRNA_combined, reduction = "umap", label = TRUE, repel = TRUE)

After normalization and integration, the UMAPs of all subsets look quite similar, so we can assume integration is correct and we can proceed forward. Once this has been verified, we can go back a few steps and refine our analysis

Determine the dimensionality of the dataset

scRNA_combined <- JackStraw(scRNA_combined, num.replicate = 100)
scRNA_combined <- ScoreJackStraw(scRNA_combined, dims=1:20)

JackStrawPlot(scRNA_combined, dims=1:20)
## Warning: Removed 28000 rows containing missing values (`geom_point()`).

ElbowPlot(scRNA_combined)

Based on these two graphs, we may consider using 13-15 PCs.

scRNA_combined <- FindNeighbors(scRNA_combined, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
scRNA_combined <- FindClusters(scRNA_combined, resolution = 0.5) # Try different levels of resolution and chose the one that results in cluster with most biological sense
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3006
## Number of edges: 151631
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7946
## Number of communities: 5
## Elapsed time: 0 seconds
scRNA_combined <- RunUMAP(scRNA_combined, dims = 1:30)
## 17:17:37 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:17:37 Read 3006 rows and found 30 numeric columns
## 17:17:37 Using Annoy for neighbor search, n_neighbors = 30
## 17:17:37 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:17:37 Writing NN index file to temp file C:\Users\alba_\AppData\Local\Temp\Rtmps7vENO\file42f06347f0c
## 17:17:37 Searching Annoy index using 1 thread, search_k = 3000
## 17:17:38 Annoy recall = 100%
## 17:17:39 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:17:40 Initializing from normalized Laplacian + noise (using irlba)
## 17:17:40 Commencing optimization for 500 epochs, with 129870 positive edges
## 17:17:48 Optimization finished
DimPlot(scRNA_combined, reduction="umap")

scRNA_combined$orig.ident <- factor(x = scRNA_combined$orig.ident, levels = c("HBCx-95", "HBCx-95_CAPAR","HBCx-22","HBCx22-TAMR"))

plot_hg19_clustering <- DimPlot(scRNA_combined, reduction = "umap", split.by = "orig.ident", ncol=2) + ggtitle("Clustering in hg19 cells") + theme(plot.title = element_text(hjust=0.5))
plot_hg19_clustering

Finding marker genes

hg19_markers <- FindAllMarkers(scRNA_combined, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
hg19_markers %>%
    group_by(cluster) %>%
    slice_max(n = 2, order_by = avg_log2FC)
## # A tibble: 10 × 7
## # Groups:   cluster [5]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene    
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>   
##  1 2.32e-209      2.66  0.905 0.481 4.64e-206 0       KRT16   
##  2 8.55e-171      2.38  0.813 0.369 1.71e-167 0       KRT6B   
##  3 9.19e- 60      0.787 0.916 0.823 1.84e- 56 1       CLEC3A  
##  4 3.16e- 54      0.681 0.928 0.847 6.32e- 51 1       AGR2    
##  5 5.50e-155      1.98  0.963 0.7   1.10e-151 2       HIST1H4C
##  6 8.36e-214      1.87  0.968 0.63  1.67e-210 2       KIAA0101
##  7 4.88e-  7      1.41  0.733 0.727 9.76e-  4 3       SCGB2A2 
##  8 4.34e-  4      1.28  0.593 0.595 8.68e-  1 3       SCGB1D2 
##  9 1.41e- 91      2.31  0.957 0.704 2.83e- 88 4       UBE2C   
## 10 6.28e-156      2.22  0.996 0.677 1.26e-152 4       PTTG1
hg19_markers %>%
    group_by(cluster) %>%
    top_n(n = 5, wt = avg_log2FC) -> top5


plot_hg19_heatmap <- DoHeatmap(scRNA_combined, features = top5$gene) + NoLegend()

Functional analysis

Gene set enrichment analysis

MSigDB_df <- msigdbr(species = "Homo sapiens")
fgsea_sets<- MSigDB_df %>% split(x = .$gene_symbol, f = .$gs_name)



plot_GSEA <- function(cluster_number){
  genes <- hg19_markers %>% filter(cluster==as.character(cluster_number)) %>%
  arrange(desc(p_val_adj)) %>% 
  dplyr::select(gene, avg_log2FC)
  
  # Create dataframe with genes
  ranks <- deframe(genes)
  
  # Perform GSEA analysis
  fgseaRes <- fgsea(fgsea_sets,
                    stats = ranks,
                    minSize=10,
                    maxSize=500,
                    nperm=1000000)
  
  # Plot
  ggplot(fgseaRes %>% filter(padj < 0.01) %>% head(n= 20), aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill= NES < 7.5)) +
  coord_flip() +
  labs(x="Pathway",
       y="Normalized Enrichment Score",
       title=paste0("Hallmark pathways NES from GSEA for cluster",as.character(cluster_number)))
}

for (cluster in 0:5){plot_GSEA(cluster)}

Gene ontology

plot_GO <- function(cluster_number){
  genes <- hg19_markers %>% filter(cluster==as.character(cluster_number))
  
  # Calculate enrichment
  enrichment <- enrichGO(gene = genes$gene,
             OrgDb  = org.Hs.eg.db,
             keyType = "SYMBOL",
             ont  = "BP",
             pAdjustMethod = "BH",
             pvalueCutoff  = 0.01,
             qvalueCutoff  = 0.05)
  enrichment <- clusterProfiler::simplify(enrichment, cutoff=0.7, by="p.adjust", select_fun=min)
  
  GO_ggdata <- enrichment %>%
   as_data_frame() %>%
   arrange(Count)
  GO_ggdata$Description <- factor(GO_ggdata$Description, levels = GO_ggdata$Description)
  print(tail(GO_ggdata))

ggplot(GO_ggdata, aes(x = Description, y = Count, fill = p.adjust)) +
 geom_bar(stat = "identity") +
 scale_colour_viridis_d(begin=0,end=1) +
 coord_flip() +
 ylab("Number of genes") +
 xlab("GO Terms") +
 theme(axis.text.y = element_text(size=10))
}

for (cluster in 0:4){print(plot_GO(cluster))}
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0050900 leukocyte … 21/197  398/18… 1.22e- 9 3.17e- 7 2.41e- 7 ITGA6…    21
## 2 GO:0001667 ameboidal-… 21/197  492/18… 4.96e- 8 5.24e- 6 3.98e- 6 KRT16…    21
## 3 GO:0052547 regulation… 22/197  459/18… 2.86e- 9 6.04e- 7 4.59e- 7 CAV1/…    22
## 4 GO:0008544 epidermis … 27/197  362/18… 1.05e-15 1.18e-12 8.96e-13 LAMB3…    27
## 5 GO:0043588 skin devel… 28/197  302/18… 9.82e-19 1.66e-15 1.26e-15 KRT16…    28
## 6 GO:0042060 wound heal… 36/197  442/18… 5.30e-22 1.79e-18 1.36e-18 ITGB6…    36
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description    GeneR…¹ BgRatio  pvalue p.adj…²  qvalue geneID Count
##   <chr>      <fct>          <chr>   <chr>     <dbl>   <dbl>   <dbl> <chr>  <int>
## 1 GO:0001894 tissue homeos… 5/37    275/18… 1.87e-4 9.55e-3 6.75e-3 TFF3/…     5
## 2 GO:0097191 extrinsic apo… 6/37    224/18… 4.43e-6 1.00e-3 7.09e-4 KRT18…     6
## 3 GO:2001234 negative regu… 6/37    233/18… 5.55e-6 1.00e-3 7.09e-4 IFI6/…     6
## 4 GO:0009615 response to v… 6/37    409/18… 1.30e-4 7.65e-3 5.40e-3 BST2/…     6
## 5 GO:2001233 regulation of… 7/37    374/18… 6.93e-6 1.00e-3 7.09e-4 TNFSF…     7
## 6 GO:0010038 response to m… 8/37    360/18… 3.82e-7 2.24e-4 1.58e-4 SLC40…     8
## # … with abbreviated variable names ¹​GeneRatio, ²​p.adjust

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0045787 positive r… 25/181  362/18… 1.03e-14 2.60e-12 2.27e-12 UBE2C…    25
## 2 GO:0006260 DNA replic… 28/181  286/18… 2.19e-20 1.10e-17 9.60e-18 RPA3/…    28
## 3 GO:0098813 nuclear ch… 29/181  321/18… 4.09e-20 1.71e-17 1.50e-17 UBE2C…    29
## 4 GO:0140014 mitotic nu… 31/181  325/18… 3.63e-22 3.05e-19 2.66e-19 UBE2C…    31
## 5 GO:0007059 chromosome… 35/181  382/18… 2.18e-24 5.49e-21 4.79e-21 UBE2C…    35
## 6 GO:0000280 nuclear di… 37/181  481/18… 4.27e-23 5.37e-20 4.68e-20 UBE2C…    37
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description    GeneR…¹ BgRatio  pvalue p.adj…²  qvalue geneID Count
##   <chr>      <fct>          <chr>   <chr>     <dbl>   <dbl>   <dbl> <chr>  <int>
## 1 GO:0019318 hexose metabo… 4/25    229/18… 2.17e-4 7.80e-3 5.35e-3 PGK1/…     4
## 2 GO:0051402 neuron apopto… 5/25    255/18… 1.83e-5 9.86e-4 6.75e-4 EGLN3…     5
## 3 GO:0070997 neuron death   5/25    368/18… 1.05e-4 4.52e-3 3.10e-3 EGLN3…     5
## 4 GO:0071456 cellular resp… 6/25    151/18… 3.67e-8 7.92e-6 5.42e-6 NDRG1…     6
## 5 GO:0001666 response to h… 8/25    296/18… 2.82e-9 1.71e-6 1.17e-6 NDRG1…     8
## 6 GO:0036293 response to d… 8/25    309/18… 3.96e-9 1.71e-6 1.17e-6 NDRG1…     8
## # … with abbreviated variable names ¹​GeneRatio, ²​p.adjust

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0140694 non-membra… 25/151  386/18… 5.84e-16 8.73e-14 7.70e-14 BIRC5…    25
## 2 GO:0044772 mitotic ce… 26/151  473/18… 7.28e-15 6.80e-13 6.00e-13 CDKN3…    26
## 3 GO:0098813 nuclear ch… 33/151  321/18… 2.64e-27 9.86e-25 8.70e-25 PTTG1…    33
## 4 GO:0140014 mitotic nu… 37/151  325/18… 2.80e-32 3.71e-29 3.27e-29 PTTG1…    37
## 5 GO:0007059 chromosome… 39/151  382/18… 3.31e-32 3.71e-29 3.27e-29 PTTG1…    39
## 6 GO:0000280 nuclear di… 42/151  481/18… 6.20e-32 4.64e-29 4.09e-29 PTTG1…    42
## # … with abbreviated variable name ¹​GeneRatio

Automated cluster annotation

Method 1- scCATCH

hg19_geneinfo <- rev_gene(data = scRNA_combined[['integrated']]@data,
                          data_type = "data",
                          species="Human",
                          geneinfo = geneinfo)

hg19_ann_1 <- createscCATCH(data = scRNA_combined[['integrated']]@data, 
                          cluster = as.character(Idents(scRNA_combined)))

hg19_ann_1 <- findmarkergene(object = hg19_ann_1, species = "Human", marker = cellmatch, tissue = "Breast", cancer="Breast Cancer", use_method = "2")
## There are 535 potential marker genes in CellMatch database for Human Breast Cancer on Breast.
hg19_ann_1 <- findcelltype(hg19_ann_1)
hg19_ann_1@celltype 
##   cluster                                                 cluster_marker
## 1       0 G0S2, FN1, IGFBP5, ITGA6, TACSTD2, LIMA1, PMEPA1, DNAJB1, CD44
## 2       3                                                         IGFBP5
## 3       2                                     CENPF, BIRC5, HELLS, BLVRB
## 4       1                                                           BST2
## 5       4                                                   CENPF, BIRC5
##          cell_type celltype_score celltype_related_marker
## 1 Cancer Stem Cell           0.98             CD44, ITGA6
## 2      Killer Cell           0.50                  IGFBP5
## 3    Helper T Cell           0.61     CENPF, BIRC5, HELLS
## 4           T Cell           0.50                    BST2
## 5    Helper T Cell           0.58            CENPF, BIRC5
##                                                                                                                                                                                                                                                                                                                                                                                                   PMID
## 1 24099815, 28986882, 27840965, 23799994, 23053657, 23768049, 28719381, 28579829, 28554844, 28399903, 28166194, 28148288, 28036386, 28012234, 27768764, 27630305, 27458252, 27133070, 27115469, 26893363, 26170011, 25990436, 24596390, 24297508, 24171482, 24144739, 24055090, 23543868, 23112116, 23056395, 23036368, 22761812, 22464177, 21979823, 21802218, 21092249, 22459349, 27640754, 25940879
## 2                                                                                                                                                                                                                                                                                                                                                                                             28474673
## 3                                                                                                                                                                                                                                                                                                                                                                                             28474673
## 4                                                                                                                                                                                                                                                                                                                                                                                             28474673
## 5                                                                                                                                                                                                                                                                                                                                                                                             28474673

Method 2- SingleR

hpca.se <- HumanPrimaryCellAtlasData()
## Warning: 'HumanPrimaryCellAtlasData' is deprecated.
## Use 'celldex::HumanPrimaryCellAtlasData' instead.
## See help("Deprecated")
## snapshotDate(): 2022-10-31
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
hpca.se
## class: SummarizedExperiment 
## dim: 19363 713 
## metadata(0):
## assays(1): logcounts
## rownames(19363): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
## rowData names(0):
## colnames(713): GSM112490 GSM112491 ... GSM92233 GSM92234
## colData names(3): label.main label.fine label.ont
hg19_ann_2 <- SingleR(test = scRNA_combined[['integrated']]@data, 
                      ref = hpca.se, 
                      assay.type.test=1,
                      labels = hpca.se$label.main, clusters = scRNA_combined@meta.data[["seurat_clusters"]])

hg19_ann_2$labels
## [1] "Epithelial_cells" "Epithelial_cells" "BM & Prog."       "Epithelial_cells"
## [5] "BM & Prog."

Cell population labeling

levels(scRNA_combined)
## [1] "0" "1" "2" "3" "4"
new_clusters_hg19 <- c("Fibroblasts","T-cells","Helper-T-Cells 1","NK cells","Helper-T-Cells 2")

names(new_clusters_hg19) <- levels(scRNA_combined)

scRNA_combined <- RenameIdents(scRNA_combined, new_clusters_hg19)

DimPlot(scRNA_combined, reduction="umap", label=T, label.box =T) 

Analysis of mouse cells

Data integration

# Normalize and identify variable features
scRNA_files_mm10 <- lapply(X = scRNA_files_mm10, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# Select features repeatedly variable between the 4 datasets
features_mm10 <- SelectIntegrationFeatures(object.list = scRNA_files_mm10)

# Select anchors for integration
brca_anchors_mm10 <- FindIntegrationAnchors(object.list = scRNA_files_mm10, anchor.features = features_mm10)
## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2294 anchors
## Filtering anchors
##  Retained 2030 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2088 anchors
## Filtering anchors
##  Retained 1826 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2833 anchors
## Filtering anchors
##  Retained 2294 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2231 anchors
## Filtering anchors
##  Retained 2108 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2983 anchors
## Filtering anchors
##  Retained 2716 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 2769 anchors
## Filtering anchors
##  Retained 2515 anchors
# Create the integrated data assay
scRNA_combined_mm10 <- IntegrateData(anchorset=brca_anchors_mm10)
## Merging dataset 1 into 4
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 2 into 4 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 3 into 4 1 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data

Quality control

# Use the integrated assay as default
DefaultAssay(scRNA_combined_mm10) <- "RNA"

# Creating a new slot with the percentage of counts on mitochondrial reads


scRNA_combined_mm10[["percent.mt"]] <- PercentageFeatureSet(scRNA_combined_mm10, pattern="mt-")

# Violin plots of specific features
VlnPlot(scRNA_combined_mm10, 
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 3,
        split.by="orig.ident")

plot_qc_mm10_1 <- VlnPlot(scRNA_combined_mm10, 
        features = c("nFeature_RNA"),
        split.by="orig.ident") +
  geom_hline(yintercept=c(200,6000), linetype="dashed", color = "firebrick4") + ylim(0,7000)
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
plot_qc_mm10_2 <-  VlnPlot(scRNA_combined_mm10, 
        features = c("percent.mt"),
        split.by="orig.ident") +
  geom_hline(yintercept=10, linetype="dashed", color = "firebrick4") + ylim(0,50)
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

We will remove cells with a percentage of mitochondrial reads too high (>5%) and with an abnormally high or low number of feature counts, that can suggest empty droplets or multiplets in the microfluidic system, respectively.

scRNA_combined_mm10 <- subset(scRNA_combined_mm10, subset = nFeature_RNA > 200 & nFeature_RNA < 6000 & percent.mt < 10)

VlnPlot(scRNA_combined_mm10, 
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 3,
        split.by="orig.ident")

Visualization and clustering

Check integration

# First let's do the same analysis without integration
scRNA_combined_mm10 <- FindVariableFeatures(scRNA_combined_mm10)
scRNA_combined_mm10 <- ScaleData(scRNA_combined_mm10, verbose = FALSE)
scRNA_combined_mm10 <- RunPCA(scRNA_combined_mm10, npcs = 30, verbose = FALSE)
scRNA_combined_mm10 <- RunUMAP(scRNA_combined_mm10, reduction = "pca", dims = 1:30)
## 17:22:23 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:22:23 Read 4017 rows and found 30 numeric columns
## 17:22:23 Using Annoy for neighbor search, n_neighbors = 30
## 17:22:23 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:22:24 Writing NN index file to temp file C:\Users\alba_\AppData\Local\Temp\Rtmps7vENO\file42f06908d22
## 17:22:24 Searching Annoy index using 1 thread, search_k = 3000
## 17:22:25 Annoy recall = 100%
## 17:22:26 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:22:27 Initializing from normalized Laplacian + noise (using irlba)
## 17:22:27 Commencing optimization for 500 epochs, with 158874 positive edges
## 17:22:40 Optimization finished
scRNA_combined_mm10 <- FindNeighbors(scRNA_combined_mm10, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
scRNA_combined_mm10 <- FindClusters(scRNA_combined_mm10, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4017
## Number of edges: 135529
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9071
## Number of communities: 14
## Elapsed time: 0 seconds
p1_mm10 <- DimPlot(scRNA_combined_mm10, reduction = "umap", group.by = "orig.ident") + ggtitle("mm10 pre-integration")
p2 <- DimPlot(scRNA_combined_mm10, reduction = "umap", label = TRUE, repel = TRUE)
p1_mm10 + p2

DimPlot(scRNA_combined_mm10, reduction = "umap", split.by = "orig.ident")

# Change the default to the integrated dataset
DefaultAssay(scRNA_combined_mm10) <- "integrated"

scRNA_combined_mm10 <- ScaleData(scRNA_combined_mm10, verbose = FALSE)
scRNA_combined_mm10 <- RunPCA(scRNA_combined_mm10, npcs = 30, verbose = FALSE)
scRNA_combined_mm10 <- RunUMAP(scRNA_combined_mm10, reduction = "pca", dims = 1:30)
## 17:22:46 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:22:46 Read 4017 rows and found 30 numeric columns
## 17:22:46 Using Annoy for neighbor search, n_neighbors = 30
## 17:22:46 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:22:47 Writing NN index file to temp file C:\Users\alba_\AppData\Local\Temp\Rtmps7vENO\file42f05b526cf2
## 17:22:47 Searching Annoy index using 1 thread, search_k = 3000
## 17:22:48 Annoy recall = 100%
## 17:22:49 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:22:51 Initializing from normalized Laplacian + noise (using irlba)
## 17:22:51 Commencing optimization for 500 epochs, with 167402 positive edges
## 17:23:11 Optimization finished
scRNA_combined_mm10 <- FindNeighbors(scRNA_combined_mm10, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
scRNA_combined_mm10 <- FindClusters(scRNA_combined_mm10, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4017
## Number of edges: 155284
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9020
## Number of communities: 13
## Elapsed time: 0 seconds
# Visualization
p2_mm10 <- DimPlot(scRNA_combined_mm10, reduction = "umap", group.by = "orig.ident") + ggtitle("mm10 post-integration")
p3 <- DimPlot(scRNA_combined_mm10, reduction = "umap", label = TRUE, repel = TRUE)
p2_mm10 + p3

DimPlot(scRNA_combined_mm10, reduction = "umap", split.by = "orig.ident")

After normalization and integration, the UMAPs of all subsets look quite similar, so we can assume integration is correct and we can proceed forward. Once this has been verified, we can go back a few steps and refine our analysis

Determine the dimensionality of the dataset

scRNA_combined_mm10 <- JackStraw(scRNA_combined_mm10, num.replicate = 100)
scRNA_combined_mm10 <- ScoreJackStraw(scRNA_combined_mm10, dims=1:20)

JackStrawPlot(scRNA_combined_mm10, dims=1:20)
## Warning: Removed 28000 rows containing missing values (`geom_point()`).

ElbowPlot(scRNA_combined_mm10)

Based on these two graphs, we may consider using 10 PCs.

scRNA_combined_mm10 <- FindNeighbors(scRNA_combined_mm10, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
scRNA_combined_mm10 <- FindClusters(scRNA_combined_mm10, resolution = 0.6) # Try different levels of resolution and chose the one that results in cluster with most biological sense1
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4017
## Number of edges: 155284
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8901
## Number of communities: 14
## Elapsed time: 0 seconds
scRNA_combined_mm10 <- RunUMAP(scRNA_combined_mm10, dims = 1:10)
## 17:25:29 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:25:29 Read 4017 rows and found 10 numeric columns
## 17:25:29 Using Annoy for neighbor search, n_neighbors = 30
## 17:25:29 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:25:30 Writing NN index file to temp file C:\Users\alba_\AppData\Local\Temp\Rtmps7vENO\file42f064d65142
## 17:25:30 Searching Annoy index using 1 thread, search_k = 3000
## 17:25:31 Annoy recall = 100%
## 17:25:32 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:25:34 Initializing from normalized Laplacian + noise (using irlba)
## 17:25:34 Commencing optimization for 500 epochs, with 160050 positive edges
## 17:25:46 Optimization finished
DimPlot(scRNA_combined_mm10, reduction="umap")

scRNA_combined_mm10$orig.ident <- factor(x = scRNA_combined_mm10$orig.ident, levels = c("HBCx-95", "HBCx-95_CAPAR","HBCx-22","HBCx22-TAMR"))

plot_mm10_clustering <- DimPlot(scRNA_combined_mm10, reduction = "umap", split.by = "orig.ident",ncol=2) + ggtitle("Clustering in mm10 cells") + theme(plot.title = element_text(hjust=0.5))

Finding marker genes

mm10_markers <- FindAllMarkers(scRNA_combined_mm10, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
mm10_markers %>%
    group_by(cluster) %>%
    slice_max(n = 2, order_by = avg_log2FC)
## # A tibble: 28 × 7
## # Groups:   cluster [14]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene 
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>
##  1 9.24e-198       2.83 0.84  0.692 1.85e-194 0       Gzme 
##  2 1.97e-192       2.76 0.949 0.742 3.93e-189 0       Spp1 
##  3 0               4.12 0.993 0.736 0         1       C1qa 
##  4 0               3.83 0.994 0.657 0         1       C1qb 
##  5 2.20e-154       2.76 0.901 0.72  4.39e-151 2       Sfrp4
##  6 1.77e-204       2.67 0.973 0.641 3.53e-201 2       Apod 
##  7 4.81e- 97       2.93 0.909 0.617 9.61e- 94 3       Mmp3 
##  8 6.67e- 11       2.66 0.682 0.654 1.33e-  7 3       Saa3 
##  9 1.51e-168       3.23 0.99  0.409 3.02e-165 4       Sbsn 
## 10 4.39e-142       3.21 0.946 0.569 8.78e-139 4       Dmkn 
## # … with 18 more rows
mm10_markers %>%
    group_by(cluster) %>%
    top_n(n = 5, wt = avg_log2FC) -> top5
plot_mm10_heatmap <-DoHeatmap(scRNA_combined_mm10, features = top5$gene) + NoLegend()

Functional analysis

Gene set enrichment analysis

MSigDB_df <- msigdbr(species = "Homo sapiens")
fgsea_sets<- MSigDB_df %>% split(x = .$gene_symbol, f = .$gs_name)



plot_GSEA <- function(cluster_number){
  genes <- mm10_markers %>% filter(cluster==as.character(cluster_number)) %>%
  arrange(desc(p_val_adj)) %>% 
  dplyr::select(gene, avg_log2FC)
  
  # Create dataframe with genes
  ranks <- deframe(genes)
  
  # Perform GSEA analysis
  fgseaRes <- fgsea(fgsea_sets,
                    stats = ranks,
                    minSize=10,
                    maxSize=500,
                    nperm=1000000)
  
  # Plot
  ggplot(fgseaRes %>% filter(padj < 0.01) %>% head(n= 20), aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill= NES < 7.5)) +
  coord_flip() +
  labs(x="Pathway",
       y="Normalized Enrichment Score",
       title=paste0("Hallmark pathways NES from GSEA for cluster",as.character(cluster_number)))
}

plot_GSEA(0)

for (cluster in 0:5){plot_GSEA(cluster)}

Gene ontology

plot_GO_mm10 <- function(cluster_number){
  genes <- mm10_markers %>% filter(cluster==as.character(cluster_number))
  
  # Calculate enrichment
  enrichment <- enrichGO(gene = genes$gene,
             OrgDb  = org.Mm.eg.db,
             keyType = "SYMBOL",
             ont  = "BP",
             pAdjustMethod = "BH",
             pvalueCutoff  = 0.01,
             qvalueCutoff  = 0.05)
  enrichment <- clusterProfiler::simplify(enrichment, cutoff=0.7, by="p.adjust", select_fun=min)
  
  GO_ggdata <- enrichment %>%
   as_data_frame() %>%
   arrange(Count)
  GO_ggdata$Description <- factor(GO_ggdata$Description, levels = GO_ggdata$Description)
  print(tail(GO_ggdata))
  
ggplot(GO_ggdata, aes(x = Description, y = Count, fill = p.adjust)) +
 geom_bar(stat = "identity") +
 scale_colour_viridis_d(begin=0,end=1) +
 coord_flip() +
 ylab("Number of genes") +
 xlab("GO Terms") +
 theme(axis.text.y = element_text(size=10))
}

for (cluster in 0:13){print(plot_GO_mm10(cluster))}
## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0060537 muscle tis… 28/303  494/28… 5.36e-13 8.08e-11 4.96e-11 Csrp2…    28
## 2 GO:0001667 ameboidal-… 32/303  471/28… 6.39e-17 2.89e-14 1.78e-14 Timp1…    32
## 3 GO:0031589 cell-subst… 36/303  371/28… 4.27e-24 3.09e-21 1.90e-21 Col8a…    36
## 4 GO:0030198 extracellu… 48/303  322/28… 8.07e-41 1.32e-37 8.10e-38 Col8a…    48
## 5 GO:0043062 extracellu… 48/303  323/28… 9.40e-41 1.32e-37 8.10e-38 Col8a…    48
## 6 GO:0045229 external e… 48/303  324/28… 1.09e-40 1.32e-37 8.10e-38 Col8a…    48
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0007159 leukocyte … 39/240  411/28… 1.37e-29 2.45e-26 1.51e-26 H2-Eb…    39
## 2 GO:0022407 regulation… 39/240  497/28… 1.74e-26 5.67e-24 3.49e-24 H2-Eb…    39
## 3 GO:0032103 positive r… 40/240  459/28… 6.65e-29 7.93e-26 4.88e-26 Ctss/…    40
## 4 GO:0002697 regulation… 40/240  464/28… 1.01e-28 9.04e-26 5.56e-26 Fcer1…    40
## 5 GO:0002449 lymphocyte… 40/240  483/28… 4.74e-28 2.92e-25 1.80e-25 C1qa/…    40
## 6 GO:0002460 adaptive i… 42/240  486/28… 3.43e-30 1.23e-26 7.56e-27 C1qa/…    42
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0001667 ameboidal-… 28/266  471/28… 6.33e-15 1.41e-12 7.99e-13 Dcn/H…    28
## 2 GO:0050900 leukocyte … 29/266  385/28… 3.82e-18 2.41e-15 1.37e-15 Tnfai…    29
## 3 GO:0042060 wound heal… 33/266  380/28… 1.92e-22 1.81e-19 1.03e-19 Serpi…    33
## 4 GO:0030198 extracellu… 38/266  322/28… 1.12e-30 1.78e-27 1.01e-27 Dpt/H…    38
## 5 GO:0043062 extracellu… 38/266  323/28… 1.26e-30 1.78e-27 1.01e-27 Dpt/H…    38
## 6 GO:0045229 external e… 38/266  324/28… 1.41e-30 1.78e-27 1.01e-27 Dpt/H…    38
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0031589 cell-subst… 30/259  371/28… 6.17e-20 2.55e-17 1.40e-17 Itgbl…    30
## 2 GO:0050678 regulation… 30/259  442/28… 8.29e-18 2.57e-15 1.40e-15 Sfrp1…    30
## 3 GO:0042060 wound heal… 32/259  380/28… 9.67e-22 4.49e-19 2.46e-19 Serpi…    32
## 4 GO:0030198 extracellu… 46/259  322/28… 1.70e-41 2.83e-38 1.55e-38 Mfap4…    46
## 5 GO:0043062 extracellu… 46/259  323/28… 1.97e-41 2.83e-38 1.55e-38 Mfap4…    46
## 6 GO:0045229 external e… 46/259  324/28… 2.28e-41 2.83e-38 1.55e-38 Mfap4…    46
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0002683 negative r… 20/209  465/28… 2.18e-10 3.91e- 8 2.44e- 8 Cd55/…    20
## 2 GO:0001667 ameboidal-… 22/209  471/28… 5.24e-12 2.82e- 9 1.76e- 9 Sema3…    22
## 3 GO:0042060 wound heal… 29/209  380/28… 3.07e-21 2.48e-18 1.54e-18 Serpi…    29
## 4 GO:0030198 extracellu… 32/209  322/28… 6.06e-27 7.93e-24 4.94e-24 Dpt/H…    32
## 5 GO:0043062 extracellu… 32/209  323/28… 6.68e-27 7.93e-24 4.94e-24 Dpt/H…    32
## 6 GO:0045229 external e… 32/209  324/28… 7.37e-27 7.93e-24 4.94e-24 Dpt/H…    32
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0010563 negative r… 34/417  453/28… 5.22e-15 6.24e-13 3.63e-13 Pecam…    34
## 2 GO:0045936 negative r… 34/417  453/28… 5.22e-15 6.24e-13 3.63e-13 Pecam…    34
## 3 GO:0040013 negative r… 38/417  360/28… 1.04e-21 3.55e-19 2.06e-19 Egfl7…    38
## 4 GO:0090132 epithelium… 46/417  330/28… 2.03e-31 1.79e-28 1.04e-28 Cdh5/…    46
## 5 GO:0045765 regulation… 47/417  308/28… 6.40e-34 1.42e-30 8.23e-31 Cdh5/…    47
## 6 GO:0001667 ameboidal-… 57/417  471/28… 2.05e-35 9.06e-32 5.27e-32 Cdh5/…    57
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0060537 muscle tis… 24/237  494/28… 3.57e-12 9.98e-10 6.72e-10 Tnnt2…    24
## 2 GO:0001667 ameboidal-… 26/237  471/28… 2.23e-14 1.07e-11 7.19e-12 Acta2…    26
## 3 GO:0042060 wound heal… 28/237  380/28… 1.22e-18 1.02e-15 6.88e-16 Serpi…    28
## 4 GO:0030198 extracellu… 31/237  322/28… 5.33e-24 7.20e-21 4.84e-21 Col15…    31
## 5 GO:0043062 extracellu… 31/237  323/28… 5.86e-24 7.20e-21 4.84e-21 Col15…    31
## 6 GO:0045229 external e… 31/237  324/28… 6.43e-24 7.20e-21 4.84e-21 Col15…    31
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0002697 regulation… 38/245  464/28… 3.75e-26 8.39e-24 5.45e-24 Il1b/…    38
## 2 GO:0030098 lymphocyte… 38/245  471/28… 6.46e-26 1.33e-23 8.63e-24 Il1b/…    38
## 3 GO:0050900 leukocyte … 39/245  385/28… 2.54e-30 1.39e-27 9.05e-28 Il1b/…    39
## 4 GO:0050863 regulation… 40/245  378/28… 7.72e-32 5.08e-29 3.30e-29 Il1b/…    40
## 5 GO:1903037 regulation… 43/245  371/28… 6.61e-36 1.09e-32 7.06e-33 Il1b/…    43
## 6 GO:0007159 leukocyte … 50/245  411/28… 7.54e-43 2.48e-39 1.61e-39 Il1b/…    50
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description    GeneR…¹ BgRatio  pvalue p.adj…²  qvalue geneID Count
##   <chr>      <fct>          <chr>   <chr>     <dbl>   <dbl>   <dbl> <chr>  <int>
## 1 GO:1901342 regulation of… 12/127  312/28… 1.42e-8 5.06e-6 3.43e-6 Pkm/P…    12
## 2 GO:0042060 wound healing  12/127  380/28… 1.22e-7 2.08e-5 1.41e-5 Lox/A…    12
## 3 GO:0042692 muscle cell d… 12/127  449/28… 7.22e-7 5.68e-5 3.85e-5 Krt19…    12
## 4 GO:0002683 negative regu… 12/127  465/28… 1.04e-6 7.51e-5 5.09e-5 Mif/S…    12
## 5 GO:0010876 lipid localiz… 12/127  470/28… 1.16e-6 8.17e-5 5.54e-5 Mif/S…    12
## 6 GO:0060537 muscle tissue… 13/127  494/28… 2.90e-7 3.76e-5 2.55e-5 Ankrd…    13
## # … with abbreviated variable names ¹​GeneRatio, ²​p.adjust

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:1903037 regulation… 27/195  371/28… 4.23e-20 1.56e-17 1.04e-17 Cd160…    27
## 2 GO:0070661 leukocyte … 27/195  384/28… 1.03e-19 3.07e-17 2.05e-17 Sh2d2…    27
## 3 GO:0051251 positive r… 27/195  465/28… 1.35e-17 2.34e-15 1.56e-15 Cd160…    27
## 4 GO:0050863 regulation… 30/195  378/28… 2.43e-23 2.39e-20 1.59e-20 Rac2/…    30
## 5 GO:0007159 leukocyte … 31/195  411/28… 1.96e-23 2.39e-20 1.59e-20 Rac2/…    31
## 6 GO:0030098 lymphocyte… 35/195  471/28… 3.51e-26 1.04e-22 6.91e-23 Cd3g/…    35
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0140694 non-membra… 28/212  379/28… 5.49e-20 1.03e-17 8.20e-18 Birc5…    28
## 2 GO:0044772 mitotic ce… 31/212  451/28… 4.02e-21 8.12e-19 6.46e-19 Birc5…    31
## 3 GO:0140014 mitotic nu… 33/212  309/28… 1.38e-28 9.06e-26 7.21e-26 Birc5…    33
## 4 GO:0098813 nuclear ch… 35/212  306/28… 2.44e-31 3.21e-28 2.56e-28 Birc5…    35
## 5 GO:0007059 chromosome… 40/212  369/28… 7.52e-35 1.98e-31 1.57e-31 Ska1/…    40
## 6 GO:0000280 nuclear di… 40/212  478/28… 2.08e-30 1.82e-27 1.45e-27 Birc5…    40
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0002683 negative r… 28/208  465/28… 7.17e-18 9.76e-16 5.97e-16 Pla2g…    28
## 2 GO:0002460 adaptive i… 28/208  486/28… 2.25e-17 2.85e-15 1.74e-15 Batf/…    28
## 3 GO:0030595 leukocyte … 30/208  228/28… 4.79e-29 8.47e-26 5.18e-26 Ccl6/…    30
## 4 GO:0032103 positive r… 30/208  459/28… 4.34e-20 1.03e-17 6.27e-18 Osm/C…    30
## 5 GO:0070661 leukocyte … 31/208  384/28… 1.90e-23 9.62e-21 5.89e-21 Pla2g…    31
## 6 GO:0050900 leukocyte … 36/208  385/28… 2.16e-29 7.65e-26 4.68e-26 Ccl6/…    36
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0070661 leukocyte … 34/274  384/28… 6.00e-23 1.11e-19 7.09e-20 Tnfsf…    34
## 2 GO:0007159 leukocyte … 34/274  411/28… 5.37e-22 3.78e-19 2.42e-19 Selpl…    34
## 3 GO:0002449 lymphocyte… 34/274  483/28… 8.93e-20 1.65e-17 1.06e-17 C1qc/…    34
## 4 GO:0002460 adaptive i… 34/274  486/28… 1.08e-19 1.74e-17 1.11e-17 C1qc/…    34
## 5 GO:0032103 positive r… 35/274  459/28… 1.85e-21 9.45e-19 6.05e-19 Hmgb2…    35
## 6 GO:0002697 regulation… 36/274  464/28… 2.65e-22 2.33e-19 1.49e-19 Rac2/…    36
## # … with abbreviated variable name ¹​GeneRatio

## # A tibble: 6 × 9
##   ID         Description GeneR…¹ BgRatio   pvalue p.adjust   qvalue geneID Count
##   <chr>      <fct>       <chr>   <chr>      <dbl>    <dbl>    <dbl> <chr>  <int>
## 1 GO:0070661 leukocyte … 22/133  384/28… 5.41e-18 1.15e-15 7.38e-16 Ccl5/…    22
## 2 GO:0002683 negative r… 22/133  465/28… 3.01e-16 3.85e-14 2.46e-14 Id2/L…    22
## 3 GO:0051251 positive r… 22/133  465/28… 3.01e-16 3.85e-14 2.46e-14 Ccl5/…    22
## 4 GO:0030098 lymphocyte… 22/133  471/28… 3.92e-16 4.78e-14 3.06e-14 Cd3g/…    22
## 5 GO:0007159 leukocyte … 23/133  411/28… 1.50e-18 4.78e-16 3.06e-16 Ccl5/…    23
## 6 GO:0002449 lymphocyte… 24/133  483/28… 3.82e-18 9.00e-16 5.76e-16 Nkg7/…    24
## # … with abbreviated variable name ¹​GeneRatio

Automated cluster annotation

Method 1- scCATCH

MouseRNAseqData(ensembl = FALSE, cell.ont = c("all", "nonna", "none"))
## Cannot connect to ExperimentHub server, using 'localHub=TRUE' instead
## Using 'localHub=TRUE'
##   If offline, please also see BiocManager vignette section on offline use
## snapshotDate(): 2023-01-13
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## class: SummarizedExperiment 
## dim: 21214 358 
## metadata(0):
## assays(1): logcounts
## rownames(21214): Xkr4 Rp1 ... LOC100039574 LOC100039753
## rowData names(0):
## colnames(358): ERR525589Aligned ERR525592Aligned ... SRR1044043Aligned
##   SRR1044044Aligned
## colData names(3): label.main label.fine label.ont
mm10_geneinfo <- rev_gene(data = scRNA_combined_mm10[['integrated']]@data,
                          data_type = "data",
                          species="Mouse",
                          geneinfo = geneinfo)

mm10_ann_1 <- createscCATCH(data = scRNA_combined_mm10[['integrated']]@data, 
                          cluster = as.character(Idents(scRNA_combined_mm10)))

mm10_ann_1 <- findmarkergene(object = mm10_ann_1, species = "Mouse", marker = cellmatch, tissue="Mammary gland", use_method = "2")
## There are 380 potential marker genes in CellMatch database for Mouse on Mammary gland.
mm10_ann_1 <- findcelltype(mm10_ann_1)
mm10_ann_1@celltype
##    cluster
## 1        4
## 2        5
## 3        0
## 4        7
## 5        1
## 6        2
## 7        6
## 8        3
## 9        9
## 10      11
## 11       8
## 12      13
## 13      12
## 14      10
##                                                                                                                                                                                                                                                                                                                                                                                                                    cluster_marker
## 1                                                                                             Igfbp5, Apod, Mt1, Clec3b, Gsn, Aldh1a3, Dcn, Plet1, Serpinb1a, Igfbp4, Efemp1, Procr, Igfbp6, Rarres2, Col1a2, Fn1, Htra3, S100a4, Ogn, Dpt, Ly6c1, Col3a1, Lum, Gadd45g, Ctsl, Mfap5, Serping1, Glul, Gpx3, Ly6a, Dpep1, Serpinf1, Ctsk, Pcolce, Loxl1, Fstl1, Phlda1, Plpp3, S100a6, Aebp1, Pdpn, C1s1, Mmp2, Lsp1, Anxa1, Anxa2
## 2                                                                                                                                                                                                               Fabp4, Ptn, Gng11, Lrg1, Col4a2, Atf3, Procr, Col4a1, Ly6c1, Id3, Rbp1, Fscn1, Crip2, Sparc, Csrp1, Tspan13, Ly6a, Cdkn1a, Tsc22d1, Kit, Itga6, Cd200, Gata2, Cnn3, Slc12a2, Igfbp7, S100a6, Serpinh1, Vim, Anxa2
## 3  Actg2, Igfbp5, Cxcl14, Gng11, Acta2, Igfbp2, Tagln, Ctsd, Myl9, Postn, Tpm2, Igfbp4, Igfbp6, Tpm1, Itm2a, Col6a3, Col1a1, Col1a2, Fn1, Col3a1, Rbp1, Gadd45g, Fscn1, Crip2, Col6a1, Mfap5, Sparc, Serping1, Csrp1, Mfge8, Col6a2, Gpx3, Bgn, Lgals1, Tsc22d1, Serpinf1, Col5a2, Ctsk, Cd200, Cnn3, Pcolce, Mfap2, Loxl1, Fstl1, Mmp14, Gadd45b, Igfbp7, S100a6, Rcn3, Serpinh1, Aebp1, Vim, Tmem176a, Mmp2, Lsp1, Anxa1, Anxa2
## 4                                                                                                                                                                                                                                                               Ccl5, Hp, H2-Ab1, Lyz2, Ccl6, H2-Aa, Ctss, Cd14, Atf3, Cd68, H2-Eb1, AW112010, S100a4, Cd24a, Fcer1g, Cfp, Trf, Aif1, Tyrobp, Ms4a6c, Tspan13, Cdkn1a, Ctsc, Tgm2
## 5                                                                                                                                                                                                                                                                Ccl5, H2-Ab1, Lyz2, Ccl6, H2-Aa, Ctss, C1qc, Ctsd, Cd14, Cd68, H2-Eb1, AW112010, Fcer1g, Trf, Aif1, Tyrobp, Ms4a6c, Tspan13, Cdkn1a, Ctsc, Tgm2, Ctsb, Lgmn, Grn
## 6                   Cxcl14, Apod, Mt1, Lcn2, Lpl, Clec3b, Gsn, Dcn, Serpinb1a, Igfbp4, Efemp1, Procr, Igfbp6, Col4a1, Rarres2, Col6a3, Col1a1, Col1a2, Fn1, Htra3, Ogn, Dpt, Ly6c1, Col3a1, Id3, Lum, Gadd45g, Ctsl, Col6a1, Mfap5, Sparc, Serping1, Glul, Col6a2, Gpx3, Ly6a, Dpep1, Bgn, Serpinf1, Col5a2, Ctsk, Pcolce, Mfap2, Loxl1, Fstl1, Phlda1, Plpp3, S100a6, Rcn3, Serpinh1, Aebp1, Pdpn, Dbi, C1s1, Mmp2, Anxa1, Anxa2
## 7                                                                                                                                                         Actg2, Gng11, Acta2, Tagln, Mylk, Myl9, Cnn1, Col4a2, Postn, Tpm2, Tpm1, Col4a1, Rarres2, Col6a3, Col1a1, Col1a2, Fn1, S100a4, Col3a1, Crip2, Sparc, Csrp1, Mfge8, Bgn, Lgals1, Col5a2, Pcolce, Fstl1, Igfbp7, Cystm1, S100a6, Rcn3, Serpinh1, Aebp1, Vim, Anxa1, Anxa2
## 8                           Hp, Cxcl14, Apod, Mt1, Clec3b, Dcn, Postn, Igfbp4, Igfbp6, Itm2a, Rarres2, Col1a1, Col1a2, Fn1, Ogn, Dpt, Col3a1, Rbp1, Lum, Gadd45g, Ctsl, Col6a1, Mfap5, Sparc, Serping1, Glul, Mfge8, Col6a2, Gpx3, Ly6a, Dpep1, Bgn, Lgals1, Tsc22d1, Serpinf1, Col5a2, Ctsk, Pcolce, Mfap2, Loxl1, Fstl1, Mmp14, Phlda1, S100a6, Rcn3, Serpinh1, Aebp1, Pdpn, Dbi, Vim, C1s1, Tmem176a, Mmp2, Lsp1, Anxa1, Anxa2
## 9                                                                                                                                                                                                                                                                                                                                                                                     Ccl5, Nkg7, AW112010, S100a4, Tspan13, Lsp1
## 10                                                                                                                                                                                                                         H2-Ab1, Lyz2, Ccl6, H2-Aa, Pf4, Mt1, Ctss, C1qc, Ctsd, Cd14, Atf3, Cd68, H2-Eb1, AW112010, S100a4, Fcer1g, Cfp, Trf, Aif1, Tyrobp, Ms4a6c, Fxyd2, Cdkn1a, Ctsc, Ctsb, Cystm1, Dbi, Lgmn, Tmem176a, Grn
## 11                                                                                                                                                                                                                                                                             Actg2, Gng11, Acta2, Mt1, Tagln, Myl9, Cnn1, Postn, Tpm2, Tpm1, Col6a3, Fn1, Crip2, Csrp1, Bgn, Lgals1, Serpinf1, Col5a2, Cnn3, S100a6, Vim, Anxa1
## 12                                                                                                                                                                                                                                                                        Ccl5, H2-Ab1, Lyz2, Nkg7, H2-Aa, Ctss, C1qc, Ctsd, Cd14, Cd68, H2-Eb1, AW112010, Fcer1g, Trf, Aif1, Tyrobp, Ms4a6c, Cdkn1a, Ctsc, Tgm2, Ctsb, Lgmn, Grn
## 13                                                                                                                                                                                                                                                             H2-Ab1, Lyz2, H2-Aa, Ctss, C1qc, Ctsd, Cd14, Cd68, H2-Eb1, AW112010, Fcer1g, Cfp, Trf, Tuba1b, Aif1, Tyrobp, Ms4a6c, Cdkn1a, Ctsc, Tgm2, Ctsb, Lgmn, Tmem176a, Grn
## 14                                                                                                                             Actg2, Gng11, Acta2, Tagln, Myl9, Dcn, Postn, Tpm2, Tpm1, Col4a1, Col6a3, Col1a1, Col1a2, Fn1, S100a4, Col3a1, Tuba1b, Rbp1, Lum, Crip2, Col6a1, Sparc, Csrp1, Mfge8, Col6a2, Bgn, Lgals1, Serpinf1, Col5a2, Cnn3, Pcolce, Loxl1, Fstl1, Igfbp7, Phlda1, S100a6, Rcn3, Serpinh1, Vim, Anxa1, Anxa2
##                    cell_type celltype_score
## 1    Luminal Progenitor Cell           0.83
## 2    Luminal Progenitor Cell           0.88
## 3         Myoepithelial Cell           0.78
## 4    Luminal Progenitor Cell           0.79
## 5    Luminal Progenitor Cell           0.73
## 6    Luminal Progenitor Cell           0.81
## 7         Myoepithelial Cell           0.79
## 8    Luminal Progenitor Cell           0.82
## 9  Killer Cell, Luminal Cell     0.58, 0.58
## 10   Luminal Progenitor Cell           0.73
## 11        Myoepithelial Cell           0.77
## 12   Luminal Progenitor Cell           0.73
## 13   Luminal Progenitor Cell           0.73
## 14        Myoepithelial Cell           0.78
##                                                                      celltype_related_marker
## 1                                                Aldh1a3, Plet1, Igfbp5, Glul, Anxa2, Phlda1
## 2                                                    Kit, Lrg1, Slc12a2, Ptn, Tsc22d1, Anxa2
## 3        Acta2, Tagln, Actg2, Igfbp2, Cxcl14, Myl9, Tpm2, Tpm1, Csrp1, Rbp1, Mfge8, Serpinh1
## 4                                                                                 Cd14, Tgm2
## 5                                                                                 Cd14, Tgm2
## 6                                                             Glul, Lcn2, Dbi, Anxa2, Phlda1
## 7  Acta2, Tagln, Actg2, Cnn1, Mylk, Myl9, Tpm2, Tpm1, Csrp1, Mfge8, Col4a1, Col4a2, Serpinh1
## 8                                                   Mfge8, Glul, Tsc22d1, Dbi, Anxa2, Phlda1
## 9                                                              AW112010, Ccl5, Tspan13, Nkg7
## 10                                                                                 Cd14, Dbi
## 11                                        Acta2, Tagln, Actg2, Cnn1, Myl9, Tpm2, Tpm1, Csrp1
## 12                                                                                Cd14, Tgm2
## 13                                                                                Cd14, Tgm2
## 14               Acta2, Tagln, Actg2, Myl9, Tpm2, Tpm1, Csrp1, Rbp1, Mfge8, Col4a1, Serpinh1
##                            PMID
## 1            29225342, 29775597
## 2  21132007, 29225342, 29775597
## 3            29225342, 29775597
## 4            29225342, 29775597
## 5            29225342, 29775597
## 6                      29775597
## 7            29225342, 29775597
## 8                      29775597
## 9                      29775597
## 10           29225342, 29775597
## 11           29225342, 29775597
## 12           29225342, 29775597
## 13           29225342, 29775597
## 14           29225342, 29775597

Method 2- SingleR

mRNAseq.se <- MouseRNAseqData()
## Cannot connect to ExperimentHub server, using 'localHub=TRUE' instead
## Using 'localHub=TRUE'
##   If offline, please also see BiocManager vignette section on offline use
## snapshotDate(): 2023-01-13
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
mRNAseq.se
## class: SummarizedExperiment 
## dim: 21214 358 
## metadata(0):
## assays(1): logcounts
## rownames(21214): Xkr4 Rp1 ... LOC100039574 LOC100039753
## rowData names(0):
## colnames(358): ERR525589Aligned ERR525592Aligned ... SRR1044043Aligned
##   SRR1044044Aligned
## colData names(3): label.main label.fine label.ont
mm10_ann_2 <- SingleR(test = scRNA_combined_mm10[['integrated']]@data, 
                      ref = mRNAseq.se, 
                      assay.type.test=1,
                      labels = mRNAseq.se$label.main,
                      clusters = scRNA_combined_mm10@meta.data[["seurat_clusters"]])

mm10_ann_2
## DataFrame with 14 rows and 4 columns
##                               scores      labels delta.next pruned.labels
##                             <matrix> <character>  <numeric>   <character>
## 0   0.625067:0.3236409:0.1017131:... Fibroblasts  0.0654643   Fibroblasts
## 1   0.171448:0.0543392:0.5195881:... Macrophages  0.0850762   Macrophages
## 2   0.629893:0.3062046:0.0609768:... Fibroblasts  0.5194622   Fibroblasts
## 3   0.622290:0.3135865:0.0582042:... Fibroblasts  0.0562530   Fibroblasts
## 4   0.591171:0.2940385:0.0624225:... Fibroblasts  0.4954680   Fibroblasts
## ...                              ...         ...        ...           ...
## 9   0.0653065:0.0483964:0.536450:...    NK cells 0.01491557      NK cells
## 10  0.5634213:0.2909614:0.196302:... Fibroblasts 0.10017724   Fibroblasts
## 11  0.1781777:0.0493389:0.363372:... Macrophages 0.07486630   Macrophages
## 12  0.1327699:0.0769117:0.506982:... Macrophages 0.02656899   Macrophages
## 13  0.0881576:0.0415777:0.491922:...   Monocytes 0.00661534     Monocytes

Cell population labeling

new_clusters_mm10 <- c("Fibroblasts 1","Macrophages 1","Fibroblasts 2","Fibroblasts 3","Fibroblasts 4","Endothelial cells","Fibroblasts 5","Monocytes 1","Fibroblasts 6"," NK cells","Fibroblasts 7","Macrophages 2","Macrophages 3","Monocytes 2")

names(new_clusters_mm10) <- levels(scRNA_combined_mm10)

scRNA_combined_mm10 <- RenameIdents(scRNA_combined_mm10, new_clusters_mm10)

DimPlot(scRNA_combined_mm10, reduction="umap", label=T, label.box=F, label.size = 3.5) 

Figure preparation

Quality controls

ggarrange(plot_qc_hg38_1, plot_qc_hg38_2, plot_qc_mm10_1, plot_qc_mm10_2, common.legend = T, legend="right", labels=c("A","B","C","D"))
## Warning: Removed 50 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 50 rows containing missing values (`geom_point()`).
## Warning: Removed 50 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 50 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 10 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 10 rows containing missing values (`geom_point()`).

Integration before and after for hg19 and mm10

ggarrange(p1_hg38, p2_hg38, p1_mm10, p2_mm10, common.legend = T, legend="right", labels=c("A","B","C","D"))

Clustering

ggarrange(plot_hg19_clustering, plot_hg19_heatmap, legend="right", labels=c("A","B"))

ggarrange(plot_mm10_clustering, plot_mm10_heatmap, legend="right", labels=c("A","B"))

Save data

We will store the relevant objects to use them in other analyses

# First let's store cluster identity in the meta.data slot

scRNA_combined@meta.data$cluster_name <- scRNA_combined@active.ident
scRNA_combined_mm10@meta.data$cluster_name <- scRNA_combined_mm10@active.ident

# Now save the objects in a rds file
saveRDS(scRNA_combined, "data/R_objects/hg19_RNA.rds")
saveRDS(scRNA_combined_mm10, "data/R_objects/mm10_RNA.rds")

We will also save markers for cell annotation of scChIP data

write.xlsx(file="data/R_objects/markers_RNA_hg19.xlsx", hg19_markers)

write.xlsx(file="data/R_objects/markers_RNA_mm10.xlsx", mm10_markers)

Session info

sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Spanish_Spain.utf8  LC_CTYPE=Spanish_Spain.utf8   
## [3] LC_MONETARY=Spanish_Spain.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=Spanish_Spain.utf8    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] celldex_1.8.0               openxlsx_4.2.5.1           
##  [3] ggpubr_0.5.0                org.Mm.eg.db_3.16.0        
##  [5] org.Hs.eg.db_3.16.0         AnnotationDbi_1.60.0       
##  [7] clusterProfiler_4.6.0       tibble_3.1.8               
##  [9] msigdbr_7.5.1               SingleR_2.0.0              
## [11] SummarizedExperiment_1.28.0 Biobase_2.58.0             
## [13] GenomicRanges_1.50.1        GenomeInfoDb_1.34.3        
## [15] IRanges_2.32.0              S4Vectors_0.36.0           
## [17] BiocGenerics_0.44.0         MatrixGenerics_1.10.0      
## [19] matrixStats_0.63.0          ggplot2_3.4.0              
## [21] scCATCH_3.2.1               purrr_0.3.5                
## [23] SeuratObject_4.1.3          Seurat_4.3.0               
## [25] stringr_1.4.1               dplyr_1.0.10               
## [27] R.utils_2.12.2              R.oo_1.25.0                
## [29] R.methodsS3_1.8.2          
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3                scattermore_0.8              
##   [3] tidyr_1.2.1                   bit64_4.0.5                  
##   [5] knitr_1.41                    irlba_2.3.5.1                
##   [7] DelayedArray_0.24.0           data.table_1.14.6            
##   [9] KEGGREST_1.38.0               RCurl_1.98-1.9               
##  [11] generics_0.1.3                ScaledMatrix_1.6.0           
##  [13] cowplot_1.1.1                 RSQLite_2.2.19               
##  [15] shadowtext_0.1.2              RANN_2.6.1                   
##  [17] future_1.29.0                 bit_4.0.5                    
##  [19] enrichplot_1.18.1             spatstat.data_3.0-0          
##  [21] httpuv_1.6.6                  assertthat_0.2.1             
##  [23] viridis_0.6.2                 xfun_0.35                    
##  [25] hms_1.1.2                     jquerylib_0.1.4              
##  [27] babelgene_22.9                evaluate_0.18                
##  [29] promises_1.2.0.1              fansi_1.0.3                  
##  [31] progress_1.2.2                dbplyr_2.2.1                 
##  [33] igraph_1.3.5                  DBI_1.1.3                    
##  [35] htmlwidgets_1.5.4             spatstat.geom_3.0-3          
##  [37] ellipsis_0.3.2                backports_1.4.1              
##  [39] deldir_1.0-6                  sparseMatrixStats_1.10.0     
##  [41] vctrs_0.5.1                   ROCR_1.0-11                  
##  [43] abind_1.4-5                   cachem_1.0.6                 
##  [45] withr_2.5.0                   ggforce_0.4.1                
##  [47] HDO.db_0.99.1                 progressr_0.11.0             
##  [49] sctransform_0.3.5             treeio_1.22.0                
##  [51] prettyunits_1.1.1             goftest_1.2-3                
##  [53] cluster_2.1.4                 DOSE_3.24.2                  
##  [55] ExperimentHub_2.6.0           ape_5.6-2                    
##  [57] lazyeval_0.2.2                crayon_1.5.2                 
##  [59] spatstat.explore_3.0-5        labeling_0.4.2               
##  [61] pkgconfig_2.0.3               tweenr_2.0.2                 
##  [63] vipor_0.4.5                   nlme_3.1-160                 
##  [65] rlang_1.0.6                   globals_0.16.2               
##  [67] lifecycle_1.0.3               miniUI_0.1.1.1               
##  [69] downloader_0.4                filelock_1.0.2               
##  [71] BiocFileCache_2.6.0           rsvd_1.0.5                   
##  [73] AnnotationHub_3.6.0           ggrastr_1.0.1                
##  [75] polyclip_1.10-4               lmtest_0.9-40                
##  [77] Matrix_1.5-1                  aplot_0.1.9                  
##  [79] carData_3.0-5                 zoo_1.8-11                   
##  [81] beeswarm_0.4.0                ggridges_0.5.4               
##  [83] png_0.1-7                     viridisLite_0.4.1            
##  [85] bitops_1.0-7                  gson_0.0.9                   
##  [87] KernSmooth_2.23-20            Biostrings_2.66.0            
##  [89] blob_1.2.3                    DelayedMatrixStats_1.20.0    
##  [91] qvalue_2.30.0                 parallelly_1.32.1            
##  [93] spatstat.random_3.0-1         rstatix_0.7.1                
##  [95] gridGraphics_0.5-1            ggsignif_0.6.4               
##  [97] beachmat_2.14.0               scales_1.2.1                 
##  [99] memoise_2.0.1                 magrittr_2.0.3               
## [101] plyr_1.8.8                    ica_1.0-3                    
## [103] zlibbioc_1.44.0               compiler_4.2.2               
## [105] scatterpie_0.1.8              RColorBrewer_1.1-3           
## [107] fitdistrplus_1.1-8            cli_3.4.1                    
## [109] XVector_0.38.0                listenv_0.8.0                
## [111] patchwork_1.1.2               pbapply_1.6-0                
## [113] MASS_7.3-58.1                 tidyselect_1.2.0             
## [115] stringi_1.7.8                 highr_0.9                    
## [117] yaml_2.3.6                    GOSemSim_2.24.0              
## [119] BiocSingular_1.14.0           ggrepel_0.9.2                
## [121] grid_4.2.2                    sass_0.4.4                   
## [123] fastmatch_1.1-3               tools_4.2.2                  
## [125] future.apply_1.10.0           parallel_4.2.2               
## [127] rstudioapi_0.14               gridExtra_2.3                
## [129] farver_2.1.1                  Rtsne_0.16                   
## [131] ggraph_2.1.0                  BiocManager_1.30.19          
## [133] digest_0.6.30                 shiny_1.7.3                  
## [135] Rcpp_1.0.9                    car_3.1-1                    
## [137] broom_1.0.1                   BiocVersion_3.16.0           
## [139] later_1.3.0                   RcppAnnoy_0.0.20             
## [141] httr_1.4.4                    colorspace_2.0-3             
## [143] tensor_1.5                    reticulate_1.26              
## [145] splines_4.2.2                 uwot_0.1.14                  
## [147] yulab.utils_0.0.5             tidytree_0.4.1               
## [149] spatstat.utils_3.0-1          graphlayouts_0.8.4           
## [151] sp_1.5-1                      ggplotify_0.1.0              
## [153] plotly_4.10.1                 xtable_1.8-4                 
## [155] jsonlite_1.8.3                ggtree_3.6.2                 
## [157] tidygraph_1.2.2               ggfun_0.0.9                  
## [159] R6_2.5.1                      pillar_1.8.1                 
## [161] htmltools_0.5.3               mime_0.12                    
## [163] glue_1.6.2                    fastmap_1.1.0                
## [165] BiocParallel_1.32.1           interactiveDisplayBase_1.36.0
## [167] codetools_0.2-18              fgsea_1.24.0                 
## [169] utf8_1.2.2                    lattice_0.20-45              
## [171] bslib_0.4.1                   spatstat.sparse_3.0-0        
## [173] curl_4.3.3                    ggbeeswarm_0.6.0             
## [175] leiden_0.4.3                  zip_2.2.2                    
## [177] GO.db_3.16.0                  limma_3.54.0                 
## [179] survival_3.4-0                rmarkdown_2.18               
## [181] munsell_0.5.0                 GenomeInfoDbData_1.2.9       
## [183] reshape2_1.4.4                gtable_0.3.1